#Import all the necessary modules
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.impute import KNNImputer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import datasets
from sklearn import metrics
# Load the vehicle silhouette dataset from the working directory
df_vehicle = pd.read_csv('vehicle.csv')
# Preview the first 10 rows
df_vehicle.head(10)
| compactness | circularity | distance_circularity | radius_ratio | pr.axis_aspect_ratio | max.length_aspect_ratio | scatter_ratio | elongatedness | pr.axis_rectangularity | max.length_rectangularity | scaled_variance | scaled_variance.1 | scaled_radius_of_gyration | scaled_radius_of_gyration.1 | skewness_about | skewness_about.1 | skewness_about.2 | hollows_ratio | class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 95 | 48.0 | 83.0 | 178.0 | 72.0 | 10 | 162.0 | 42.0 | 20.0 | 159 | 176.0 | 379.0 | 184.0 | 70.0 | 6.0 | 16.0 | 187.0 | 197 | van |
| 1 | 91 | 41.0 | 84.0 | 141.0 | 57.0 | 9 | 149.0 | 45.0 | 19.0 | 143 | 170.0 | 330.0 | 158.0 | 72.0 | 9.0 | 14.0 | 189.0 | 199 | van |
| 2 | 104 | 50.0 | 106.0 | 209.0 | 66.0 | 10 | 207.0 | 32.0 | 23.0 | 158 | 223.0 | 635.0 | 220.0 | 73.0 | 14.0 | 9.0 | 188.0 | 196 | car |
| 3 | 93 | 41.0 | 82.0 | 159.0 | 63.0 | 9 | 144.0 | 46.0 | 19.0 | 143 | 160.0 | 309.0 | 127.0 | 63.0 | 6.0 | 10.0 | 199.0 | 207 | van |
| 4 | 85 | 44.0 | 70.0 | 205.0 | 103.0 | 52 | 149.0 | 45.0 | 19.0 | 144 | 241.0 | 325.0 | 188.0 | 127.0 | 9.0 | 11.0 | 180.0 | 183 | bus |
| 5 | 107 | NaN | 106.0 | 172.0 | 50.0 | 6 | 255.0 | 26.0 | 28.0 | 169 | 280.0 | 957.0 | 264.0 | 85.0 | 5.0 | 9.0 | 181.0 | 183 | bus |
| 6 | 97 | 43.0 | 73.0 | 173.0 | 65.0 | 6 | 153.0 | 42.0 | 19.0 | 143 | 176.0 | 361.0 | 172.0 | 66.0 | 13.0 | 1.0 | 200.0 | 204 | bus |
| 7 | 90 | 43.0 | 66.0 | 157.0 | 65.0 | 9 | 137.0 | 48.0 | 18.0 | 146 | 162.0 | 281.0 | 164.0 | 67.0 | 3.0 | 3.0 | 193.0 | 202 | van |
| 8 | 86 | 34.0 | 62.0 | 140.0 | 61.0 | 7 | 122.0 | 54.0 | 17.0 | 127 | 141.0 | 223.0 | 112.0 | 64.0 | 2.0 | 14.0 | 200.0 | 208 | van |
| 9 | 93 | 44.0 | 98.0 | NaN | 62.0 | 11 | 183.0 | 36.0 | 22.0 | 146 | 202.0 | 505.0 | 152.0 | 64.0 | 4.0 | 14.0 | 195.0 | 204 | car |
# Column dtypes and non-null counts (several columns contain missing values)
df_vehicle.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 846 entries, 0 to 845 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 compactness 846 non-null int64 1 circularity 841 non-null float64 2 distance_circularity 842 non-null float64 3 radius_ratio 840 non-null float64 4 pr.axis_aspect_ratio 844 non-null float64 5 max.length_aspect_ratio 846 non-null int64 6 scatter_ratio 845 non-null float64 7 elongatedness 845 non-null float64 8 pr.axis_rectangularity 843 non-null float64 9 max.length_rectangularity 846 non-null int64 10 scaled_variance 843 non-null float64 11 scaled_variance.1 844 non-null float64 12 scaled_radius_of_gyration 844 non-null float64 13 scaled_radius_of_gyration.1 842 non-null float64 14 skewness_about 840 non-null float64 15 skewness_about.1 845 non-null float64 16 skewness_about.2 845 non-null float64 17 hollows_ratio 846 non-null int64 18 class 846 non-null object dtypes: float64(14), int64(4), object(1) memory usage: 125.7+ KB
# (rows, columns) of the raw dataset
df_vehicle.shape
(846, 19)
# Percentage of missing values per column (NaN count relative to row count).
n_rows = len(df_vehicle)
percent_missing = df_vehicle.isnull().sum() * 100 / n_rows
missing_value_df = pd.DataFrame(
    {'column_name': df_vehicle.columns,
     'percent_missing': percent_missing}
)
missing_value_df
| column_name | percent_missing | |
|---|---|---|
| compactness | compactness | 0.000000 |
| circularity | circularity | 0.591017 |
| distance_circularity | distance_circularity | 0.472813 |
| radius_ratio | radius_ratio | 0.709220 |
| pr.axis_aspect_ratio | pr.axis_aspect_ratio | 0.236407 |
| max.length_aspect_ratio | max.length_aspect_ratio | 0.000000 |
| scatter_ratio | scatter_ratio | 0.118203 |
| elongatedness | elongatedness | 0.118203 |
| pr.axis_rectangularity | pr.axis_rectangularity | 0.354610 |
| max.length_rectangularity | max.length_rectangularity | 0.000000 |
| scaled_variance | scaled_variance | 0.354610 |
| scaled_variance.1 | scaled_variance.1 | 0.236407 |
| scaled_radius_of_gyration | scaled_radius_of_gyration | 0.236407 |
| scaled_radius_of_gyration.1 | scaled_radius_of_gyration.1 | 0.472813 |
| skewness_about | skewness_about | 0.709220 |
| skewness_about.1 | skewness_about.1 | 0.118203 |
| skewness_about.2 | skewness_about.2 | 0.118203 |
| hollows_ratio | hollows_ratio | 0.000000 |
| class | class | 0.000000 |
Visualize a Pie-chart and print percentage of values for variable ‘class’
# Pie chart of the class distribution, with percentage labels.
class_counts = df_vehicle['class'].value_counts()
plt.pie(class_counts, autopct='%.1f%%', labels=list(class_counts.index));
plt.suptitle('class')
plt.show()
# The data is not distributed equally across the 3 classes:
# roughly half of the points are of type "car"; "bus" and "van"
# split the remainder in approximately equal proportions.
df_vehicle_ = df_vehicle.drop('class', axis=1)
df_vehicle_.columns
Index(['compactness', 'circularity', 'distance_circularity', 'radius_ratio',
'pr.axis_aspect_ratio', 'max.length_aspect_ratio', 'scatter_ratio',
'elongatedness', 'pr.axis_rectangularity', 'max.length_rectangularity',
'scaled_variance', 'scaled_variance.1', 'scaled_radius_of_gyration',
'scaled_radius_of_gyration.1', 'skewness_about', 'skewness_about.1',
'skewness_about.2', 'hollows_ratio'],
dtype='object')
# create an object for KNNImputer
# KNNImputer fills each NaN with the mean of that feature over the
# 2 nearest-neighbour rows (distance computed on the non-missing features)
imputer = KNNImputer(n_neighbors=2)
df_imputed = imputer.fit_transform(df_vehicle_)
# print dataset after performing the operation
print("\n\nAfter performing imputation\n",df_imputed)
After performing imputation [[ 95. 48. 83. ... 16. 187. 197.] [ 91. 41. 84. ... 14. 189. 199.] [104. 50. 106. ... 9. 188. 196.] ... [106. 54. 101. ... 4. 187. 201.] [ 86. 36. 78. ... 25. 190. 195.] [ 85. 36. 66. ... 18. 186. 190.]]
# Rebuild a DataFrame from the imputed ndarray.
# Reuse the column labels of the frame fed to the imputer instead of
# re-typing all 18 names by hand (fragile duplication in the original).
# KNNImputer keeps the input column order; no column here is all-NaN,
# so none is dropped during fit.
df = pd.DataFrame(df_imputed, columns=df_vehicle_.columns)
# Sanity check: imputation should have removed every missing value
df.isnull().sum()
compactness 0 circularity 0 distance_circularity 0 radius_ratio 0 pr.axis_aspect_ratio 0 max.length_aspect_ratio 0 scatter_ratio 0 elongatedness 0 pr.axis_rectangularity 0 max.length_rectangularity 0 scaled_variance 0 scaled_variance.1 0 scaled_radius_of_gyration 0 scaled_radius_of_gyration.1 0 skewness_about 0 skewness_about.1 0 skewness_about.2 0 hollows_ratio 0 dtype: int64
Check for duplicate rows in the data and treat them with the correct approach.
# Check for duplicate data
# (count of fully-duplicated rows; 0 means there is nothing to drop)
df.duplicated().sum()
0
A. Split data into X and Y.(Train and Test optional)
B. Standardize the Data.
# Features: all 18 imputed numeric columns
X = df
# Target: the vehicle class, label-encoded as car=0, bus=1, van=2.
# Encode with a non-inplace .replace() on the selected frame: the original
# called replace(..., inplace=True) on a fancy-indexed slice, which relied
# on chained assignment and a suppressed SettingWithCopyWarning.
y = df_vehicle[['class']].replace({'car': 0, 'bus': 1, 'van': 2})
print("shape of independant data: ", X.shape)
print("shape of dependant data: ", y.shape)
from scipy.stats import zscore
# Standardize every feature to zero mean / unit variance (z-scores)
XScaled=X.apply(zscore)
XScaled.head()
| compactness | circularity | distance_circularity | radius_ratio | pr.axis_aspect_ratio | max.length_aspect_ratio | scatter_ratio | elongatedness | pr.axis_rectangularity | max.length_rectangularity | scaled_variance | scaled_variance.1 | scaled_radius_of_gyration | scaled_radius_of_gyration.1 | skewness_about | skewness_about.1 | skewness_about.2 | hollows_ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.160580 | 0.508796 | 0.057491 | 0.269777 | 1.306579 | 0.311542 | -0.205821 | 0.136570 | -0.224944 | 0.758332 | -0.402275 | -0.344841 | 0.285812 | -0.327777 | -0.074607 | 0.380544 | -0.313482 | 0.183957 |
| 1 | -0.325470 | -0.626505 | 0.120944 | -0.835966 | -0.595507 | 0.094079 | -0.597077 | 0.520843 | -0.610954 | -0.344578 | -0.593442 | -0.622348 | -0.513516 | -0.060372 | 0.537193 | 0.156469 | 0.011426 | 0.452977 |
| 2 | 1.254193 | 0.833168 | 1.516917 | 1.196210 | 0.545744 | 0.311542 | 1.148529 | -1.144341 | 0.933086 | 0.689401 | 1.095198 | 1.104988 | 1.392573 | 0.073331 | 1.556859 | -0.403720 | -0.151028 | 0.049447 |
| 3 | -0.082445 | -0.626505 | -0.005963 | -0.298037 | 0.165327 | 0.094079 | -0.747561 | 0.648935 | -0.610954 | -0.344578 | -0.912053 | -0.741279 | -1.466561 | -1.263695 | -0.074607 | -0.291682 | 1.635964 | 1.529056 |
| 4 | -1.054545 | -0.139948 | -0.767403 | 1.076670 | 5.237557 | 9.444962 | -0.597077 | 0.520843 | -0.610954 | -0.275646 | 1.668698 | -0.650665 | 0.408785 | 7.293271 | 0.537193 | -0.179644 | -1.450659 | -1.699181 |
# 70/30 train-test split; random_state pinned for reproducibility
X_train, X_test, y_train, y_test = train_test_split(XScaled, y, test_size=0.30, random_state=3)
A. Train a base Classification model using SVM.
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Baseline SVM classifier with a linear kernel.
# NOTE: gamma has no effect on the linear kernel (it is used only by the
# rbf/poly/sigmoid kernels), so the original gamma=0.025 argument was dead
# and is dropped here; the fitted model is identical.
svm_model_1 = svm.SVC(C=3, kernel='linear')
svm_model_1.fit(X_train, y_train)
SVC(C=3, gamma=0.025, kernel='linear')
B. Print Classification metrics for train data.
# Test-set predictions from the baseline linear SVM
y_predict_1 = svm_model_1.predict(X_test)
# Mean accuracy on train vs test (a large gap would indicate overfitting)
train_score_1 = svm_model_1.score(X_train,y_train)
test_score_1 = svm_model_1.score(X_test, y_test)
print("SVM_model_1 score for train set:", train_score_1*100)
print("SVM_model_1 score for test set:", test_score_1*100)
SVM_model_1 score for train set: 97.46621621621621 SVM_model_1 score for test set: 95.66929133858267
# Per-class precision/recall/F1 on the test set.
# target_names are positional labels for encoded classes 0,1,2;
# the encoding above maps car=0, bus=1, van=2, matching this order.
target_names = ['car', 'bus', 'van']
print("\nClassification Report:\n", classification_report(y_test, y_predict_1, target_names=target_names))
Classification Report:
precision recall f1-score support
car 0.98 0.96 0.97 123
bus 0.94 0.94 0.94 71
van 0.92 0.97 0.94 60
accuracy 0.96 254
macro avg 0.95 0.96 0.95 254
weighted avg 0.96 0.96 0.96 254
Apply PCA on the data with 10 components.
Before applying PCA, it is necessary to check for outliers and treat them.
# Boxplot of all 18 standardized features to eyeball outliers
XScaled.boxplot()
<AxesSubplot:>
#18 columns are plotted in one figure, so it is crowded; still, we can see
#that certain features contain outliers, which should be treated.
# We define outliers using the Inter-Quartile Range (IQR = Q3 - Q1).
# A data point is an outlier if it lies below Q1 - 1.5*IQR or above Q3 + 1.5*IQR,
# where Q1 and Q3 are the 25% and 75% quantiles respectively.
# Finding the IQR for each of the numerical columns:
def check_outliers(XScaled):
    """Print, for each numeric column, how many IQR-based outliers it contains.

    A value is an outlier when it falls below Q1 - 1.5*IQR or above
    Q3 + 1.5*IQR (IQR = Q3 - Q1, quantiles computed per column).
    Also prints how many columns contain at least one outlier.
    Returns None; results are printed only.
    """
    Q1 = XScaled.quantile(0.25)
    Q3 = XScaled.quantile(0.75)
    IQR = Q3 - Q1
    # Boolean frame: True marks an outlying cell.
    outlier_mask = (XScaled < (Q1 - 1.5 * IQR)) | (XScaled > (Q3 + 1.5 * IQR))
    count = 0
    for col in outlier_mask:
        # Summing the boolean column counts outliers directly. The original
        # value_counts().iloc[1] lookup silently returned the wrong number
        # whenever outliers were the majority value, and its
        # `1 in value_counts().index` test relied on True == 1.
        n_outliers = int(outlier_mask[col].sum())
        if n_outliers > 0:
            print("No. of outliers in %s: %d" % (col, n_outliers))
            count += 1
    print("\n\nNo of attributes with outliers are :", count)
check_outliers(XScaled)
No. of outliers in radius_ratio: 3 No. of outliers in pr.axis_aspect_ratio: 8 No. of outliers in max.length_aspect_ratio: 13 No. of outliers in scaled_variance: 1 No. of outliers in scaled_variance.1: 2 No. of outliers in scaled_radius_of_gyration.1: 15 No. of outliers in skewness_about: 12 No. of outliers in skewness_about.1: 2 No of attributes with outliers are : 8
# There are 8 attributes with outliers.
# Also, the number of outliers is very low (< 20) relative to the total data points (846).
# Hence, we will replace them with the median of the respective attributes.
# NOTE(review): this is an alias, not a copy — the loop below mutates
# XScaled itself, so every later cell that uses XScaled sees the
# outlier-treated values. Use XScaled.copy() if the raw scaled frame
# must be preserved — confirm the intent.
df_vehicle_clean = XScaled
# to replace with median we will loop through each column in the dataframe
# NOTE(review): columns[:-1] skips the last column (hollows_ratio);
# presumably a leftover from when 'class' was the last column — confirm.
for col in df_vehicle_clean.columns[:-1]:
    Q1 = df_vehicle_clean[col].quantile(0.25)
    Q3 = df_vehicle_clean[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_value = Q1 - (1.5 * IQR)
    upper_value = Q3 + (1.5 * IQR)
    # Replace every value outside the IQR fences with the column median
    df_vehicle_clean.loc[(df_vehicle_clean[col]< lower_value) | ( df_vehicle_clean[col] > upper_value), col] = df_vehicle_clean[col].median()
# Re-run the outlier check; it should now report 0 attributes with outliers
check_outliers(df_vehicle_clean)
No of attributes with outliers are : 0
# Boxplot after outlier treatment — whiskers should now contain all points
plt.figure(figsize= (15,15))
df_vehicle_clean.boxplot()
plt.xticks(rotation = 90)
(array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
18]),
[Text(1, 0, 'compactness'),
Text(2, 0, 'circularity'),
Text(3, 0, 'distance_circularity'),
Text(4, 0, 'radius_ratio'),
Text(5, 0, 'pr.axis_aspect_ratio'),
Text(6, 0, 'max.length_aspect_ratio'),
Text(7, 0, 'scatter_ratio'),
Text(8, 0, 'elongatedness'),
Text(9, 0, 'pr.axis_rectangularity'),
Text(10, 0, 'max.length_rectangularity'),
Text(11, 0, 'scaled_variance'),
Text(12, 0, 'scaled_variance.1'),
Text(13, 0, 'scaled_radius_of_gyration'),
Text(14, 0, 'scaled_radius_of_gyration.1'),
Text(15, 0, 'skewness_about'),
Text(16, 0, 'skewness_about.1'),
Text(17, 0, 'skewness_about.2'),
Text(18, 0, 'hollows_ratio')])
# Check for correlation of variable
# Pearson correlation matrix; many feature pairs are highly correlated
# (e.g. scatter_ratio vs pr.axis_rectangularity), which motivates PCA below.
df_vehicle_clean.corr(method='pearson')
| compactness | circularity | distance_circularity | radius_ratio | pr.axis_aspect_ratio | max.length_aspect_ratio | scatter_ratio | elongatedness | pr.axis_rectangularity | max.length_rectangularity | scaled_variance | scaled_variance.1 | scaled_radius_of_gyration | scaled_radius_of_gyration.1 | skewness_about | skewness_about.1 | skewness_about.2 | hollows_ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| compactness | 1.000000 | 0.692741 | 0.792419 | 0.723777 | 0.195129 | 0.499928 | 0.812999 | -0.788647 | 0.813437 | 0.676143 | 0.772311 | 0.810684 | 0.585339 | -0.249401 | 0.196574 | 0.157198 | 0.298048 | 0.365552 |
| circularity | 0.692741 | 1.000000 | 0.798615 | 0.641114 | 0.197513 | 0.552733 | 0.860340 | -0.828716 | 0.857871 | 0.965754 | 0.815189 | 0.844442 | 0.936122 | 0.079163 | 0.137671 | -0.008412 | -0.112606 | 0.038354 |
| distance_circularity | 0.792419 | 0.798615 | 1.000000 | 0.799847 | 0.247111 | 0.665384 | 0.907757 | -0.912704 | 0.895803 | 0.774811 | 0.873097 | 0.888063 | 0.706139 | -0.234076 | 0.099782 | 0.266181 | 0.147905 | 0.334637 |
| radius_ratio | 0.723777 | 0.641114 | 0.799847 | 1.000000 | 0.652133 | 0.461059 | 0.775128 | -0.829327 | 0.748371 | 0.581791 | 0.791588 | 0.769103 | 0.555172 | -0.389812 | 0.036577 | 0.182074 | 0.404102 | 0.488645 |
| pr.axis_aspect_ratio | 0.195129 | 0.197513 | 0.247111 | 0.652133 | 1.000000 | 0.148919 | 0.196839 | -0.300783 | 0.163183 | 0.150505 | 0.209640 | 0.200477 | 0.151762 | -0.318690 | -0.055916 | -0.024705 | 0.399931 | 0.414940 |
| max.length_aspect_ratio | 0.499928 | 0.552733 | 0.665384 | 0.461059 | 0.148919 | 1.000000 | 0.491563 | -0.504038 | 0.488753 | 0.642713 | 0.403117 | 0.466778 | 0.397505 | -0.334128 | 0.080716 | 0.141116 | 0.081825 | 0.413174 |
| scatter_ratio | 0.812999 | 0.860340 | 0.907757 | 0.775128 | 0.196839 | 0.491563 | 1.000000 | -0.973397 | 0.992085 | 0.810658 | 0.964146 | 0.983795 | 0.800928 | 0.010510 | 0.062686 | 0.213200 | 0.004585 | 0.119962 |
| elongatedness | -0.788647 | -0.828716 | -0.912704 | -0.829327 | -0.300783 | -0.504038 | -0.973397 | 1.000000 | -0.950512 | -0.775531 | -0.949695 | -0.951540 | -0.766137 | 0.079733 | -0.044539 | -0.186517 | -0.112689 | -0.216725 |
| pr.axis_rectangularity | 0.813437 | 0.857871 | 0.895803 | 0.748371 | 0.163183 | 0.488753 | 0.992085 | -0.950512 | 1.000000 | 0.813305 | 0.951494 | 0.977838 | 0.798280 | 0.027241 | 0.071039 | 0.215075 | -0.021594 | 0.098519 |
| max.length_rectangularity | 0.676143 | 0.965754 | 0.774811 | 0.581791 | 0.150505 | 0.642713 | 0.810658 | -0.775531 | 0.813305 | 1.000000 | 0.752504 | 0.793065 | 0.866495 | 0.053969 | 0.128692 | 0.008364 | -0.106973 | 0.076770 |
| scaled_variance | 0.772311 | 0.815189 | 0.873097 | 0.791588 | 0.209640 | 0.403117 | 0.964146 | -0.949695 | 0.951494 | 0.752504 | 1.000000 | 0.948178 | 0.787207 | 0.025153 | 0.026230 | 0.197383 | 0.012477 | 0.086058 |
| scaled_variance.1 | 0.810684 | 0.844442 | 0.888063 | 0.769103 | 0.200477 | 0.466778 | 0.983795 | -0.951540 | 0.977838 | 0.793065 | 0.948178 | 1.000000 | 0.785500 | 0.005136 | 0.064524 | 0.210711 | 0.017060 | 0.122041 |
| scaled_radius_of_gyration | 0.585339 | 0.936122 | 0.706139 | 0.555172 | 0.151762 | 0.397505 | 0.800928 | -0.766137 | 0.798280 | 0.866495 | 0.787207 | 0.785500 | 1.000000 | 0.215580 | 0.161408 | -0.049536 | -0.226805 | -0.117815 |
| scaled_radius_of_gyration.1 | -0.249401 | 0.079163 | -0.234076 | -0.389812 | -0.318690 | -0.334128 | 0.010510 | 0.079733 | 0.027241 | 0.053969 | 0.025153 | 0.005136 | 0.215580 | 1.000000 | -0.061284 | -0.118001 | -0.837454 | -0.903490 |
| skewness_about | 0.196574 | 0.137671 | 0.099782 | 0.036577 | -0.055916 | 0.080716 | 0.062686 | -0.044539 | 0.071039 | 0.128692 | 0.026230 | 0.064524 | 0.161408 | -0.061284 | 1.000000 | -0.039300 | 0.086466 | 0.062753 |
| skewness_about.1 | 0.157198 | -0.008412 | 0.266181 | 0.182074 | -0.024705 | 0.141116 | 0.213200 | -0.186517 | 0.215075 | 0.008364 | 0.197383 | 0.210711 | -0.049536 | -0.118001 | -0.039300 | 1.000000 | 0.071836 | 0.195483 |
| skewness_about.2 | 0.298048 | -0.112606 | 0.147905 | 0.404102 | 0.399931 | 0.081825 | 0.004585 | -0.112689 | -0.021594 | -0.106973 | 0.012477 | 0.017060 | -0.226805 | -0.837454 | 0.086466 | 0.071836 | 1.000000 | 0.894113 |
| hollows_ratio | 0.365552 | 0.038354 | 0.334637 | 0.488645 | 0.414940 | 0.413174 | 0.119962 | -0.216725 | 0.098519 | 0.076770 | 0.086058 | 0.122041 | -0.117815 | -0.903490 | 0.062753 | 0.195483 | 0.894113 | 1.000000 |
# Pairwise scatter plots with KDE diagonals (slow: 18x18 panel grid)
sns.pairplot(df_vehicle_clean,diag_kind='kde')
<seaborn.axisgrid.PairGrid at 0x1d5ad5fe4f0>
# All features are float64 after imputation and scaling
df_vehicle_clean.dtypes
compactness float64 circularity float64 distance_circularity float64 radius_ratio float64 pr.axis_aspect_ratio float64 max.length_aspect_ratio float64 scatter_ratio float64 elongatedness float64 pr.axis_rectangularity float64 max.length_rectangularity float64 scaled_variance float64 scaled_variance.1 float64 scaled_radius_of_gyration float64 scaled_radius_of_gyration.1 float64 skewness_about float64 skewness_about.1 float64 skewness_about.2 float64 hollows_ratio float64 dtype: object
#Outliers are removed and the data points are scaled. Most of the features look right-tailed in distribution.
#Some features are multimodal, as they show more than one peak.
# PCA
# Step 1 - Create covariance matrix
# rowvar=False: columns are variables, rows are observations.
# On z-scored data this is (almost) the correlation matrix; the diagonal is
# n/(n-1) = 846/845 ≈ 1.0012 because np.cov uses the ddof=1 estimator.
covMatrix = np.cov(df_vehicle_clean,rowvar=False)
print(covMatrix)
[[ 1.00118343 0.69356106 0.79335662 0.69760892 0.13931936 0.22453393 0.81396104 -0.78958033 0.81439967 0.67694334 0.76521371 0.80179661 0.5860317 -0.20611992 0.18357512 0.15555648 0.29840068 0.36598446] [ 0.69356106 1.00118343 0.79956044 0.61793456 0.14102154 0.24825063 0.86135821 -0.82969655 0.85888644 0.96689712 0.80769712 0.83518376 0.93722987 0.06542475 0.12856699 -0.00832406 -0.1127389 0.0383995 ] [ 0.79335662 0.79956044 1.00118343 0.77092819 0.17643354 0.29884578 0.90883154 -0.91378418 0.89686274 0.77572837 0.8650733 0.87832687 0.70697452 -0.19345431 0.09318391 0.26340145 0.14807985 0.33503256] [ 0.69760892 0.61793456 0.77092819 0.92789861 0.448249 0.19935393 0.747103 -0.79934214 0.72131345 0.56075601 0.75506295 0.73230172 0.53509931 -0.31014921 0.03288426 0.17345382 0.38949155 0.47097747] [ 0.13931936 0.14102154 0.17643354 0.448249 0.50917377 0.04769821 0.14054048 -0.21475472 0.11651067 0.10745874 0.14812893 0.14140107 0.10835631 -0.18783035 -0.03723913 -0.01743433 0.28554494 0.2962612 ] [ 0.22453393 0.24825063 0.29884578 0.19935393 0.04769821 0.20148177 0.22077695 -0.22637977 0.21951513 0.28866346 0.17917707 0.20710195 0.17853263 -0.12387835 0.03381513 0.06264374 0.03675046 0.1855701 ] [ 0.81396104 0.86135821 0.90883154 0.747103 0.14054048 0.22077695 1.00118343 -0.97454851 0.99325882 0.81161704 0.95528524 0.97300953 0.80187576 0.00868597 0.05854058 0.21097449 0.00459075 0.12010425] [-0.78958033 -0.82969655 -0.91378418 -0.79934214 -0.21475472 -0.22637977 -0.97454851 1.00118343 -0.95163731 -0.7764487 -0.94096703 -0.94110805 -0.76704409 0.06589641 -0.04159415 -0.18456987 -0.11282213 -0.21698156] [ 0.81439967 0.85888644 0.89686274 0.72131345 0.11651067 0.21951513 0.99325882 -0.95163731 1.00118343 0.81426723 0.94274956 0.96711757 0.79922469 0.02251326 0.06634097 0.21282939 -0.02161984 0.09863569] [ 0.67694334 0.96689712 0.77572837 0.56075601 0.10745874 0.28866346 0.81161704 -0.7764487 0.81426723 1.00118343 0.74558839 0.78437003 0.86752059 0.04460346 0.12018176 
0.00827623 -0.10709999 0.07686047] [ 0.76521371 0.80769712 0.8650733 0.75506295 0.14812893 0.17917707 0.95528524 -0.94096703 0.94274956 0.74558839 0.98054408 0.92806661 0.7799722 0.02057274 0.02424134 0.19329823 0.01236282 0.08526706] [ 0.80179661 0.83518376 0.87832687 0.73230172 0.14140107 0.20710195 0.97300953 -0.94110805 0.96711757 0.78437003 0.92806661 0.9770371 0.77688826 0.00419321 0.05952645 0.20598153 0.01687305 0.12070258] [ 0.5860317 0.93722987 0.70697452 0.53509931 0.10835631 0.17853263 0.80187576 -0.76704409 0.79922469 0.86752059 0.7799722 0.77688826 1.00118343 0.17816808 0.15073467 -0.04901893 -0.22707383 -0.11795489] [-0.20611992 0.06542475 -0.19345431 -0.31014921 -0.18783035 -0.12387835 0.00868597 0.06589641 0.02251326 0.04460346 0.02057274 0.00419321 0.17816808 0.68222855 -0.04724344 -0.09639122 -0.69212187 -0.74669788] [ 0.18357512 0.12856699 0.09318391 0.03288426 -0.03723913 0.03381513 0.05854058 -0.04159415 0.06634097 0.12018176 0.02424134 0.05952645 0.15073467 -0.04724344 0.87108728 -0.03627537 0.08074789 0.05860302] [ 0.15555648 -0.00832406 0.26340145 0.17345382 -0.01743433 0.06264374 0.21097449 -0.18456987 0.21282939 0.00827623 0.19329823 0.20598153 -0.04901893 -0.09639122 -0.03627537 0.97807043 0.07108616 0.19344181] [ 0.29840068 -0.1127389 0.14807985 0.38949155 0.28554494 0.03675046 0.00459075 -0.11282213 -0.02161984 -0.10709999 0.01236282 0.01687305 -0.22707383 -0.69212187 0.08074789 0.07108616 1.00118343 0.89517086] [ 0.36598446 0.0383995 0.33503256 0.47097747 0.2962612 0.1855701 0.12010425 -0.21698156 0.09863569 0.07686047 0.08526706 0.12070258 -0.11795489 -0.74669788 0.05860302 0.19344181 0.89517086 1.00118343]]
# Step 2- Get eigen values and eigen vector
eig_vals, eig_vecs = np.linalg.eig(covMatrix)
# Fix: the original passed '...%s' and the array as two separate print()
# arguments, so the literal '%s' was printed (visible in the output) instead
# of being interpolated. Use the % operator to format properly.
print('Eigen Vectors \n%s' % eig_vecs)
print('\n Eigen Values \n%s' % eig_vals)
Eigen Vectors %s [[-2.77601692e-01 -1.20715490e-01 -4.77122699e-02 1.50786553e-01 -3.90756707e-02 -4.10817815e-01 -3.74564188e-01 -6.65892598e-01 2.90838008e-01 -1.49031862e-01 -4.29404808e-04 1.69953195e-02 1.51322175e-02 1.04775745e-01 -2.15382468e-02 -8.75327094e-02 -4.36950678e-02 3.18467897e-02] [-2.96429791e-01 1.37077079e-01 -2.02802286e-01 -2.27454747e-02 -2.20875244e-01 2.48216530e-01 -2.08651258e-01 6.26517342e-02 3.40135274e-03 1.38537481e-01 -7.14718056e-03 -1.77395437e-01 6.85517945e-02 2.23226605e-02 7.38999920e-01 -1.36081853e-01 -2.54748759e-01 -9.15263003e-02] [-3.06714582e-01 -6.93025362e-02 7.98214105e-02 5.40144121e-02 -1.28195295e-01 2.77166589e-02 4.76835118e-01 -2.03867434e-02 1.19757211e-01 -4.95024168e-01 -2.42700978e-01 -5.10806261e-01 3.00121371e-04 -2.50590674e-01 -3.81381372e-02 1.21584735e-02 -3.38794542e-02 -4.56207954e-02] [-2.62145035e-01 -2.11267550e-01 7.68977940e-03 -1.86782033e-01 3.72129403e-01 1.71433869e-01 1.23985256e-01 -1.03532989e-01 2.43083252e-01 3.47366401e-01 -2.59538797e-01 2.27267567e-01 -1.90733616e-03 -2.23262149e-01 -1.21073919e-01 1.07733209e-01 -3.14220194e-01 -4.29097825e-01] [-6.65186787e-02 -1.80521619e-01 -9.90647870e-02 -3.04878215e-01 4.93045843e-01 5.03907622e-01 -7.20177159e-02 -3.08929754e-01 -1.25224721e-01 -1.60532468e-01 1.90162750e-01 -1.94392733e-01 -1.33113102e-02 1.04123695e-01 4.80210517e-02 -1.39677378e-02 2.21813581e-01 2.90616216e-01] [-8.38020100e-02 -4.72037700e-02 -3.05847812e-02 3.59821424e-02 -2.97893609e-01 1.67535155e-01 3.49320912e-01 -3.07372084e-01 -1.15457600e-01 -9.48790486e-02 -2.73169954e-02 4.60440747e-01 -4.87787425e-03 1.55099164e-01 2.46020694e-01 2.81088876e-01 4.43645862e-01 -2.44304792e-01] [-3.19847929e-01 5.48025040e-02 9.81163979e-02 -3.38349810e-03 6.55162501e-02 -1.48657517e-01 6.63058783e-02 6.45258051e-02 -1.68696253e-01 4.84299733e-02 1.93717832e-01 1.57832827e-02 -8.53874097e-01 8.26072250e-02 4.48899051e-02 1.57387626e-01 -1.12927088e-01 7.78537426e-02] [ 
3.16410941e-01 8.69247514e-03 -6.83352507e-02 7.25440525e-02 -1.24382832e-01 9.49673701e-02 -1.48891552e-01 -1.68884656e-01 2.62963991e-01 1.83858217e-01 2.63460511e-02 -1.08513148e-01 -2.53286791e-01 -6.99234673e-01 1.36890865e-01 2.27698281e-01 2.44113656e-01 1.26538758e-01] [-3.16849231e-01 6.87857516e-02 1.00024801e-01 2.08090417e-02 3.53002565e-02 -1.66362815e-01 4.74342843e-02 1.90361315e-02 -1.51014107e-01 1.39862904e-01 3.17802011e-01 -2.79840331e-02 4.28800739e-01 -1.43194167e-01 4.90861186e-03 6.36435048e-01 -1.42004383e-01 2.81318961e-01] [-2.83326467e-01 1.22845837e-01 -2.00808289e-01 8.69388897e-03 -4.00372558e-01 2.85585537e-01 -1.84029214e-01 -1.28332300e-01 -3.09434669e-01 3.14256352e-01 -2.92466169e-01 -1.47962093e-01 -3.26092812e-02 9.84572898e-03 -5.01779317e-01 -9.72526353e-03 8.28086994e-02 1.01051914e-01] [-3.06940266e-01 5.61644786e-02 1.13843405e-01 -5.93569699e-02 1.78676056e-01 -1.78302659e-01 4.61602170e-02 2.15395330e-01 1.34036686e-01 1.27649642e-01 -4.71674590e-01 2.47076612e-01 4.35070996e-02 -6.90597134e-02 1.83684866e-01 -1.82157626e-01 3.54739139e-01 5.06654946e-01] [-3.11873528e-01 4.86932764e-02 9.61898770e-02 -1.84658229e-03 8.45234160e-02 -1.67610700e-01 1.64969951e-02 5.34115752e-02 -1.76280539e-01 1.70719349e-01 4.39402814e-01 -9.12533580e-02 1.16056799e-01 -3.02591710e-01 -5.31950822e-02 -4.25715067e-01 4.21804849e-01 -3.52292833e-01] [-2.72731912e-01 2.15092862e-01 -2.12793317e-01 -1.41910512e-02 -5.46966782e-02 2.53655900e-01 -2.70296064e-01 3.64467287e-01 4.83696652e-01 -3.63966543e-01 2.19223885e-01 2.40055223e-01 -2.34251128e-02 1.51202403e-03 -2.33829485e-01 1.22591770e-01 1.18164050e-01 -6.55905727e-02] [ 2.62601217e-02 4.41897194e-01 7.76990203e-02 -9.99631960e-03 2.31672335e-01 -1.90525017e-02 -2.70989785e-01 -1.10737744e-01 -4.65594082e-01 -4.15324384e-01 -2.56158176e-01 2.25238146e-01 1.21102952e-02 -3.32858596e-01 4.50804871e-02 3.63756052e-02 -1.18991820e-01 -1.55207643e-01] [-3.37479694e-02 -2.26460823e-02 
-4.17596177e-01 8.16496891e-01 3.41832180e-01 9.13594878e-02 1.38500713e-01 3.61789253e-02 -8.20626742e-02 5.26167872e-02 -9.62045617e-03 1.95104714e-02 5.98525724e-04 4.32780357e-03 5.23453546e-03 -2.08351609e-02 3.66271426e-03 3.37239292e-02] [-5.90042964e-02 -1.14471313e-01 7.79177163e-01 4.09634979e-01 -3.03197820e-02 3.70715352e-01 -2.42553568e-01 1.77181162e-02 5.40380697e-02 2.06576395e-02 -1.87248711e-02 -1.72997715e-02 8.25401379e-03 7.33058073e-02 1.38927761e-02 8.64427711e-03 2.02569904e-02 -2.97475344e-02] [-3.08780030e-02 -5.48007364e-01 -1.26190581e-01 -2.18964954e-02 4.78051254e-02 -1.93674132e-01 -3.95684035e-01 3.18731917e-01 -2.06056711e-01 -1.07349303e-01 -2.31577220e-01 -1.84776170e-01 -2.08330158e-02 4.72145562e-02 8.60488593e-02 3.10149555e-01 2.64776271e-01 -2.57409656e-01] [-7.49387208e-02 -5.46343272e-01 -6.18936971e-02 1.17229707e-02 -2.56599552e-01 9.61219704e-02 7.05633736e-03 6.73400403e-02 -2.11897266e-01 -1.96263024e-01 1.69476084e-01 3.98270239e-01 1.08811303e-02 -3.31481796e-01 -3.17123983e-02 -2.71347902e-01 -2.92422638e-01 2.67117637e-01]] Eigen Values %s [9.36600516e+00 2.94438509e+00 1.11711573e+00 8.91015136e-01 5.18914045e-01 4.73006089e-01 2.20873799e-01 2.04188488e-01 1.11117625e-01 6.99713325e-02 5.97554761e-02 4.80962052e-02 2.97563996e-03 3.38475412e-02 1.21502967e-02 1.84989003e-02 2.32143175e-02 2.42250379e-02]
# Explained-variance percentage of each component, largest first.
tot = sum(eig_vals)
var_exp = [(ev / tot) * 100 for ev in sorted(eig_vals, reverse=True)]
# Running total of explained variance across the components.
cum_var_exp = np.cumsum(var_exp)
print("Cumulative Variance Explained", cum_var_exp)
Cumulative Variance Explained [ 58.03208759 76.2755981 83.19728529 88.71804548 91.93325461 94.86401648 96.23255808 97.49771692 98.18620552 98.61975028 98.98999725 99.28800298 99.4977235 99.64782266 99.79165936 99.90627918 99.98156283 100. ]
D. Visualize Cumulative Variance Explained with Number of Components.
# Scree plot: individual explained variance per component
plt.plot(var_exp)
[<matplotlib.lines.Line2D at 0x1d5bbeced60>]
# Visually we can observe that there is a steep drop in the variance explained as the number of PCs increases.
# We will proceed with 10 components here, but depending on the requirement, 90% variation (about 5 components) would also do well.
# Plotting: per-component variance bars overlaid with the cumulative curve.
plt.figure(figsize=(10, 5))
component_ids = range(1, eig_vals.size + 1)
plt.bar(component_ids, var_exp, alpha=0.5, align='center',
        label='Individual explained variance')
plt.step(component_ids, cum_var_exp, where='mid',
         label='Cumulative explained variance')
plt.ylabel('Explained Variance Ratio')
plt.xlabel('Principal Components')
plt.legend(loc='best')
plt.tight_layout()
plt.show()
# Using scikit-learn PCA: it performs all the above steps and maps the data
# to the PCA dimensions in one shot.
from sklearn.decomposition import PCA

# Dimensionality reduction: project the 18 features onto the top 10 PCs
pca = PCA(n_components=10)
data_reduced = pca.fit_transform(df_vehicle_clean)
data_reduced.T
array([[ 0.33224683, -1.56852114, 3.7645719 , ..., 4.81704104,
-3.25395221, -4.72103584],
[-0.27859673, -0.36409259, 0.21590525, ..., -0.06570697,
-0.97301804, 0.38513057],
[-0.27180121, -0.05382962, -1.03964362, ..., -0.65578995,
1.90591995, 1.23116296],
...,
[-0.84582922, -0.01871451, -0.26295544, ..., -0.85328062,
0.27390621, -0.18248062],
[ 0.0906114 , -0.15998665, 0.29866291, ..., -0.34854038,
0.90707068, 0.4851271 ],
[-0.01951924, -0.49480012, -0.49782395, ..., 0.44359567,
-0.28044152, 0.01098305]])
# Principal axes (10 x 18): each row is a unit-length loading vector
pca.components_
array([[ 0.27760169, 0.29642979, 0.30671458, 0.26214504, 0.06651868,
0.08380201, 0.31984793, -0.31641094, 0.31684923, 0.28332647,
0.30694027, 0.31187353, 0.27273191, -0.02626012, 0.03374797,
0.0590043 , 0.030878 , 0.07493872],
[-0.12071549, 0.13707708, -0.06930254, -0.21126755, -0.18052162,
-0.04720377, 0.0548025 , 0.00869248, 0.06878575, 0.12284584,
0.05616448, 0.04869328, 0.21509286, 0.44189719, -0.02264608,
-0.11447131, -0.54800736, -0.54634327],
[-0.04771227, -0.20280229, 0.07982141, 0.00768978, -0.09906479,
-0.03058478, 0.0981164 , -0.06833525, 0.1000248 , -0.20080829,
0.11384341, 0.09618988, -0.21279332, 0.07769902, -0.41759618,
0.77917716, -0.12619058, -0.0618937 ],
[ 0.15078655, -0.02274547, 0.05401441, -0.18678203, -0.30487821,
0.03598214, -0.0033835 , 0.07254405, 0.02080904, 0.00869389,
-0.05935697, -0.00184658, -0.01419105, -0.00999632, 0.81649689,
0.40963498, -0.0218965 , 0.01172297],
[-0.03907567, -0.22087524, -0.12819529, 0.3721294 , 0.49304584,
-0.29789361, 0.06551625, -0.12438283, 0.03530026, -0.40037256,
0.17867606, 0.08452342, -0.05469668, 0.23167233, 0.34183218,
-0.03031978, 0.04780513, -0.25659955],
[ 0.41081782, -0.24821653, -0.02771666, -0.17143387, -0.50390762,
-0.16753515, 0.14865752, -0.09496737, 0.16636281, -0.28558554,
0.17830266, 0.1676107 , -0.2536559 , 0.0190525 , -0.09135949,
-0.37071535, 0.19367413, -0.09612197],
[ 0.37456419, 0.20865126, -0.47683512, -0.12398526, 0.07201772,
-0.34932091, -0.06630588, 0.14889155, -0.04743428, 0.18402921,
-0.04616022, -0.016497 , 0.27029606, 0.27098979, -0.13850071,
0.24255357, 0.39568404, -0.00705634],
[-0.6658926 , 0.06265173, -0.02038674, -0.10353299, -0.30892975,
-0.30737208, 0.06452581, -0.16888466, 0.01903613, -0.1283323 ,
0.21539533, 0.05341158, 0.36446729, -0.11073774, 0.03617893,
0.01771812, 0.31873192, 0.06734004],
[ 0.29083801, 0.00340135, 0.11975721, 0.24308325, -0.12522472,
-0.1154576 , -0.16869625, 0.26296399, -0.15101411, -0.30943467,
0.13403669, -0.17628054, 0.48369665, -0.46559408, -0.08206267,
0.05403807, -0.20605671, -0.21189727],
[-0.14903186, 0.13853748, -0.49502417, 0.3473664 , -0.16053247,
-0.09487905, 0.04842997, 0.18385822, 0.1398629 , 0.31425635,
0.12764964, 0.17071935, -0.36396654, -0.41532438, 0.05261679,
0.02065764, -0.1073493 , -0.19626302]])
E. Draw a horizontal line on the above plot to highlight the threshold of 90%.
# Plotting: same variance chart as above, with a horizontal guide at 90%.
plt.figure(figsize=(10, 5))
pcs = range(1, eig_vals.size + 1)
plt.bar(pcs, var_exp, alpha=0.5, align='center',
        label='Individual explained variance')
plt.step(pcs, cum_var_exp, where='mid',
         label='Cumulative explained variance')
# Threshold line: components left of the crossing explain 90% of variance
plt.axhline(y=90, linewidth=2, color='g')
plt.ylabel('Explained Variance Ratio')
plt.xlabel('Principal Components')
plt.legend(loc='best')
plt.tight_layout()
plt.show()
As we see from the above graph, 90% of the variation in data can be explained using 5 features.
# 5 features explain 90 % of the variance
pca_rd = PCA(n_components=5)
# NOTE(review): fitted on XScaled — after the outlier-treatment cell,
# XScaled is the same object as df_vehicle_clean (aliased, not copied),
# so this actually uses the treated data. Confirm the intent.
pca_rd.fit(XScaled)
PCA(n_components=5)
# Project the scaled data onto the 5 retained principal components
XScaled_pca = pd.DataFrame(pca_rd.transform(XScaled))
print("shape after dimensionality reduction:", XScaled_pca.shape)
XScaled_pca.head()
shape after dimensionality reduction: (846, 5)
| 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|
| 0 | 0.332247 | -0.278597 | -0.271801 | -0.263435 | -0.066647 |
| 1 | -1.568521 | -0.364093 | -0.053830 | 0.907031 | -0.524538 |
| 2 | 3.764572 | 0.215905 | -1.039644 | 0.852246 | 0.964286 |
| 3 | -1.716517 | -2.816198 | -0.474023 | -0.047198 | -0.655885 |
| 4 | -0.650169 | 1.685306 | -0.122192 | -0.058608 | 1.337780 |
# Principal components should be mutually uncorrelated — the pairplot
# should show clouds of points with no linear trend
sns.pairplot(XScaled_pca, diag_kind='kde')
<seaborn.axisgrid.PairGrid at 0x1d5bbf80ee0>
#After dimensionality reduction using PCA our attributes have become independent with no correlation among themselves.
#As most of them have cloud of data points with no linear kind of relationship.
Train SVM model on components selected from above step.
We get the best parameters as C = 5 and gamma = 0.05 for the rbf kernel. Hence let us re-construct the model with these parameters.
# SVM on the PCA-reduced data with the tuned hyper-parameters
svm_model_2 = svm.SVC(kernel='rbf', C=5, gamma=0.05)
svm_model_2.fit(X_train, y_train)
SVC(C=5, gamma=0.05)
Print Classification metrics for train data of above model and share insights.
# Predict on the held-out split and report accuracy on both splits
y_predict_2 = svm_model_2.predict(X_test)
train_score_2 = svm_model_2.score(X_train, y_train)
test_score_2 = svm_model_2.score(X_test, y_test)
for split_name, split_score in (("train", train_score_2), ("test", test_score_2)):
    print(f"SVM_model_2 score for {split_name} set:", split_score * 100)
SVM_model_2 score for train set: 99.1554054054054 SVM_model_2 score for test set: 96.8503937007874
# Per-class precision/recall/f1 for the tuned model on the test split
target_names = ['car', 'bus', 'van']
report_2 = classification_report(y_test, y_predict_2, target_names=target_names)
print("\nClassification Report:\n", report_2)
Classification Report:
precision recall f1-score support
car 0.99 0.96 0.98 123
bus 0.97 0.97 0.97 71
van 0.92 0.98 0.95 60
accuracy 0.97 254
macro avg 0.96 0.97 0.97 254
weighted avg 0.97 0.97 0.97 254
#The accuracy on the test dataset has improved from 95.6 to 96.85, even after reducing the data to 5 components
#compared to the 18 original features used in the SVM base model
A. Train another SVM on the components out of PCA. Tune the parameters to improve performance.
# Grid search (5-fold CV by default) to tune SVC hyper-parameters
from sklearn.model_selection import GridSearchCV

params = {
    'C': range(1, 11),
    'gamma': [0.001, 0.025, 0.05, 0.04, 0.03, 0.1, 0.5, 1, 10],
    'kernel': ['linear', 'rbf'],
}
model = GridSearchCV(svm.SVC(), param_grid=params, verbose=1)
model.fit(X_train, y_train)
Fitting 5 folds for each of 180 candidates, totalling 900 fits
GridSearchCV(estimator=SVC(),
param_grid={'C': range(1, 11),
'gamma': [0.001, 0.025, 0.05, 0.04, 0.03, 0.1, 0.5, 1,
10],
'kernel': ['linear', 'rbf']},
verbose=1)
B. Share best Parameters observed from above step.
# Best hyper-parameter combination found by the cross-validated grid search
print("Best Hyper Parameters:\n", model.best_params_)
Best Hyper Parameters:
{'C': 7, 'gamma': 0.025, 'kernel': 'rbf'}
C. Print Classification metrics for train data of above model and share relative improvement in performance in all the models along with insights.
# Re-fit an SVM using the grid search's best parameters (C=7, gamma=0.025, rbf)
svm_model_3 = svm.SVC(kernel='rbf', C=7, gamma=0.025)
svm_model_3.fit(X_train, y_train)
SVC(C=7, gamma=0.025)
# Predict on the held-out split and report accuracy on both splits
y_predict_3 = svm_model_3.predict(X_test)
train_score_3 = svm_model_3.score(X_train, y_train)
test_score_3 = svm_model_3.score(X_test, y_test)
for split_name, split_score in (("train", train_score_3), ("test", test_score_3)):
    print(f"SVM_model_3 score for {split_name} set:", split_score * 100)
SVM_model_3 score for train set: 98.81756756756756 SVM_model_3 score for test set: 96.8503937007874
# Per-class precision/recall/f1 for the grid-search-tuned model on the test split
target_names = ['car', 'bus', 'van']
report_3 = classification_report(y_test, y_predict_3, target_names=target_names)
print("\nClassification Report:\n", report_3)
Classification Report:
precision recall f1-score support
car 0.99 0.96 0.98 123
bus 0.97 0.97 0.97 71
van 0.92 0.98 0.95 60
accuracy 0.97 254
macro avg 0.96 0.97 0.97 254
weighted avg 0.97 0.97 0.97 254
Explain pre-requisite/assumptions of PCA.
Principal components analysis (PCA, for short) is a variable-reduction technique that shares many similarities to exploratory factor analysis. Its aim is to reduce a larger set of variables into a smaller set of 'artificial' variables, called 'principal components', which account for most of the variance in the original variables. It is a good practice to scale all datapoints to a single scale before performing any type of clustering. PCA technique is used when: You have multiple variables that should be measured at the continuous level (although ordinal variables are very frequently used). There needs to be a linear relationship between all variables. The reason for this assumption is that a PCA is based on Pearson correlation coefficients, and as such, there needs to be a linear relationship between the variables. Your data should be suitable for data reduction. Effectively, you need to have adequate correlations between the variables in order for variables to be reduced to a smaller number of components. There should be no significant outliers. Outliers are important because these can have a disproportionate influence on your results. SPSS Statistics recommends determining outliers as component scores greater than 3 standard deviations away from the mean.
Explain advantages and limitations of PCA.
Advantages: 1. PCA efficiently removes correlated features, even when hundreds of features are present; after PCA there is no correlation among the resulting components. 2. The training time of algorithms reduces significantly with fewer features, so if the input dimensionality is too high, using PCA to speed up the algorithm is a reasonable choice. 3. Overfitting mainly occurs when there are too many variables in the dataset, so PCA helps in overcoming overfitting by reducing the number of features. 4. It is very hard to visualize and understand data in high dimensions; PCA transforms high-dimensional data to a low-dimensional form so that it can be visualized easily.
Limitations: After implementing PCA on the dataset, the original features will turn into Principal Components. Principal Components are the linear combination of the original features. Principal Components are not as readable and interpretable as original features. You must standardize your data before implementing PCA, otherwise PCA will not be able to find the optimal Principal Components. For instance, if a feature set has data expressed in units of Kilograms, Light years, or Millions, the variance scale is huge in the training set. If PCA is applied on such a feature set, the resultant loadings for features with high variance will also be large. Hence, principal components will be biased towards features with high variance, leading to false results.Also, for standardization, all the categorical features are required to be converted into numerical features before PCA can be applied. Although Principal Components try to cover maximum variance among the features in a dataset, if we don’t select the number of Principal Components with care, it may miss some information as compared to the original list of features.